Corpus similarity

The goal of this notebook is to compare the two corpora -- the final notebooks and the homework notebooks -- and to identify measurable differences between them.



In [1]:

    
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.features import Features
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.encoders.ast_graph.ast_graph import *



In [2]:

    
# Load the two corpora as lists of NotebookMiner objects.
HW_CORPUS_DIR = '../hw_corpus'
FINAL_CORPUS_DIR = '../testbed/Final'

# Homework corpus: notebooks sit directly inside the corpus directory.
# Only .ipynb entries are kept (the same filter the Final corpus uses below) so
# that stray entries such as checkpoint folders or OS metadata files are never
# handed to NotebookMiner. Entries are sorted because os.listdir returns them
# in arbitrary order, which would make downstream results depend on the
# filesystem.
notebooks = [os.path.join(HW_CORPUS_DIR, fname)
             for fname in sorted(os.listdir(HW_CORPUS_DIR))
             if fname.endswith('.ipynb')]
hw_notebook_objs = [NotebookMiner(path) for path in notebooks]

# Final corpus: one sub-directory per person, each holding that person's
# notebooks. Non-directory entries at the top level are skipped.
people = sorted(os.listdir(FINAL_CORPUS_DIR))
notebooks = []
for person in people:
    person_dir = os.path.join(FINAL_CORPUS_DIR, person)
    if not os.path.isdir(person_dir):
        continue
    notebooks.extend(os.path.join(person_dir, filename)
                     for filename in sorted(os.listdir(person_dir))
                     if filename.endswith('.ipynb'))
notebook_objs = [NotebookMiner(path) for path in notebooks]



In [3]:

    
from nbminer.stats.multiple_summary import MultipleSummary
# Wrap each corpus in a MultipleSummary (homework first, then final).
hw_summary, final_summary = (
    MultipleSummary(corpus) for corpus in (hw_notebook_objs, notebook_objs)
)



In [4]:

    
# Report corpus sizes, taken as the length of each summary's summary_vec.
n_final = len(final_summary.summary_vec)
n_homework = len(hw_summary.summary_vec)
print("Number of Final notebooks: ", n_final)
print("Number of Homework notebooks: ", n_homework)









    



Number of Final notebooks:  177
Number of Homework notebooks:  464



In [5]:

    
# Compare the average number of cells per notebook across the two corpora.
avg_cells_final = final_summary.average_number_of_cells()
avg_cells_homework = hw_summary.average_number_of_cells()
print("Average number of cells, Final: ", avg_cells_final)
print("Average number of cells, Homework: ", avg_cells_homework)









    



Average number of cells, Final:  68.92090395480226
Average number of cells, Homework:  36.42672413793103



In [6]:

    
# Compare the average lines of code per notebook across the two corpora.
avg_loc_final = final_summary.average_lines_of_code()
avg_loc_homework = hw_summary.average_lines_of_code()
print("Average lines of code, Final: ", avg_loc_final)
print("Average lines of code, Homework: ", avg_loc_homework)









    



Average lines of code, Final:  271.3502824858757
Average lines of code, Homework:  197.14008620689654



In [ ]:

Combined Clustering



In [7]:

    
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
# Combine both corpora into one Features collection:
# homework notebooks are labelled 'group_1', final notebooks 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Pipeline stages; the captured output lists them in this same order.
gastf, rbn, gi, fe = GetASTFeatures(), ResampleByNode(), GetImports(), FeatureEncoding()
ke = KmeansEncoder(n_clusters=100)
# Disabled alternative encoder: ASTGraphReducer(a, threshold=20, split_call=False)
njs = NotebookJaccardSimilarity()

stages = [gastf, rbn, gi, fe, ke, njs]
pipe = Pipeline(stages)
# `njs` is kept as its own name because the following cell queries it.
a = pipe.transform(a)









    



<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x15238d07f0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x15238d0828>
<nbminer.preprocess.get_imports.GetImports object at 0x1a2e7d8b00>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a2e7d8780>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a2e81b0f0>
<nbminer.results.similarity.jaccard_similarity.NotebookJaccardSimilarity object at 0x1a2e81bda0>



In [8]:

    
import numpy as np
# Query the Jaccard results for 'group_1'; the first value is reported as the
# within-group scores and the second as the outside-group scores.
intra, inter = njs.group_average_jaccard_similarity('group_1')
intra_scores = np.array(intra)
inter_scores = np.array(inter)
print('Mean within group: ', np.mean(intra_scores))
print('STD within group: ', np.std(intra_scores))
print('Mean outside group: ', np.mean(inter_scores))
print('STD outside group: ', np.std(inter_scores))









    



Mean within group:  0.223739330482
STD within group:  0.0429085293912
Mean outside group:  0.220370504491
STD outside group:  0.0439395959595



In [9]:

    
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.reconstruction_error.astor_error import AstorError
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
# Same two-group setup as the 100-cluster run, this time with 10 clusters:
# homework notebooks are labelled 'group_1', final notebooks 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Pipeline stages; the captured output lists them in this same order.
gastf, rbn, gi, fe = GetASTFeatures(), ResampleByNode(), GetImports(), FeatureEncoding()
ke = KmeansEncoder(n_clusters=10)
# Disabled alternative encoder: ASTGraphReducer(a, threshold=20, split_call=False)
njs = NotebookJaccardSimilarity()

stages = [gastf, rbn, gi, fe, ke, njs]
pipe = Pipeline(stages)
# NOTE(review): no later visible cell reads this 10-cluster `njs` -- confirm
# whether its similarity statistics were meant to be printed.
a = pipe.transform(a)









    



<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a43adc9b0>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x151ab42da0>
<nbminer.preprocess.get_imports.GetImports object at 0x1a406224a8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x10a84fef0>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a404b4978>
<nbminer.results.similarity.jaccard_similarity.NotebookJaccardSimilarity object at 0x1a3b7acb70>

Prediction of group



In [10]:

    
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
# Two-group feature set for the prediction experiment:
# homework notebooks are labelled 'group_1', final notebooks 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Same preprocessing and 10-cluster encoding as the similarity run, but the
# last stage is a CorpusIdentifier instead of the Jaccard similarity stage.
gastf, rbn, gi, fe = GetASTFeatures(), ResampleByNode(), GetImports(), FeatureEncoding()
ke = KmeansEncoder(n_clusters=10)
# Disabled alternative encoder: ASTGraphReducer(a, threshold=20, split_call=False)
ci = CorpusIdentifier()

stages = [gastf, rbn, gi, fe, ke, ci]
pipe = Pipeline(stages)
# `ci` is kept as its own name because the following cell calls ci.predict().
a = pipe.transform(a)









    



<nbminer.preprocess.get_ast_features.GetASTFeatures object at 0x1a5c495198>
<nbminer.preprocess.resample_by_node.ResampleByNode object at 0x1a592c0048>
<nbminer.preprocess.get_imports.GetImports object at 0x1a592c06d8>
<nbminer.preprocess.feature_encoding.FeatureEncoding object at 0x1a5ced0358>
<nbminer.encoders.cluster.kmeans_encoder.KmeansEncoder object at 0x1a5ced01d0>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x1a5cf31048>



In [25]:

    
%matplotlib inline
import matplotlib.pyplot as plt
# Run the corpus classifier and plot true-positive rate against false-positive rate.
fpr, tpr, m = ci.predict()
# NOTE(review): `m` is presumably the area under this curve -- confirm against
# CorpusIdentifier.predict.
print(m)

# Explicit figure/axes so the plot carries its own labels and can be read
# without the surrounding code.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='classifier')
# Diagonal = performance of random guessing, for visual reference.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False positive rate',
       ylabel='True positive rate',
       title='ROC curve: homework vs. final corpus')
ax.legend(loc='lower right')
# plt.show() renders the figure without echoing the Line2D repr as cell output.
plt.show()









    



0.493810386473






    Out[25]:





[<matplotlib.lines.Line2D at 0x1a9641e630>]



In [3]:

    
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_simple_features import GetSimpleFeatures
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier

# Two-group feature set: homework notebooks are labelled 'group_1',
# final notebooks 'group_2'.
a = Features(hw_notebook_objs, 'group_1')
a.add_notebooks(notebook_objs, 'group_2')

# Shorter pipeline: simple features fed straight into a CorpusIdentifier
# configured with feature_name='string'.
gsf, ci = GetSimpleFeatures(), CorpusIdentifier(feature_name='string')
stages = [gsf, ci]
pipe = Pipeline(stages)
# `ci` is kept as its own name because the following cell calls ci.predict().
a = pipe.transform(a)









    



<nbminer.preprocess.get_simple_features.GetSimpleFeatures object at 0x1063b9a90>
<nbminer.results.prediction.corpus_identifier.CorpusIdentifier object at 0x150ade57f0>



In [10]:

    
%matplotlib inline
import matplotlib.pyplot as plt
# Run the corpus classifier and plot true-positive rate against false-positive rate.
fpr, tpr, m = ci.predict()
# NOTE(review): `m` is presumably the area under this curve -- confirm against
# CorpusIdentifier.predict.
print(m)

# Explicit figure/axes so the plot carries its own labels and can be read
# without the surrounding code.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='classifier')
# Diagonal = performance of random guessing, for visual reference.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False positive rate',
       ylabel='True positive rate',
       title='ROC curve: homework vs. final corpus')
ax.legend(loc='lower right')
# plt.show() renders the figure without echoing the Line2D repr as cell output.
plt.show()









    



0.42789296884






    Out[10]:





[<matplotlib.lines.Line2D at 0x151c9d1550>]



In [ ]: